
* This code is for cleaning the raw data extracted from the USPTO bulk data
* The resulting STATA files are "basic76.dta", "class76.dta", "invent76.dta", and "citing_cited76.dta".


*------------------------------------------------------------------------------*

*************
* basic.dta : It contains wku, apd, isd, ocl, nam_assg, first_ocl, second_ocl, subclass, and adate.
*************

* wku : patent number
* apd : application date
* isd : issue date
* ocl : primary US classification
* nam_assg : name of assignee
* first_ocl : the first 3 digits of ocl
* second_ocl : the next 3 digits of ocl (after the first 3 digits)
* subclass : the next 6 digits of ocl (after the first 3 digits)
* adate : the Stata elapsed date of apd (days since Jan. 1st, 1960)


* Import the four raw "basic" extracts (fields separated by "*") and store
* them as basic1.dta to basic4.dta in the working directory, in
* chronological order of the year span they cover.
local k = 0
foreach span in 1976_2001 2002_2004 2005_2014 2015 {
    local ++k
    import delimited "/Users/khs/Documents/patent/raw_data/basic_`span'.csv", delimiter("*") clear
    save basic`k', replace
}


* Clean each imported "basic" file in place: rebuild the assignee name, keep
* only rows with a numeric patent number and a parsable primary class, and
* derive the class components and the application date.
forvalues f = 1/4 {
    use basic`f', clear

    * The assignee name continues into v6 and v7; glue the pieces together.
    * NOTE(review): presumably the name itself contained the "*" delimiter
    * and was split on import -- confirm against the raw files.
    replace nam_assg = nam_assg + v6 + v7
    drop v6 v7

    * Keep only utility patents: truncate wku to its first 8 characters and
    * drop every row where that prefix is not a number.
    replace wku = substr(wku, 1, 8)
    destring wku, replace force
    drop if missing(wku)

    * first_ocl: first 3 characters of ocl, converted to a number.
    gen first_ocl = substr(ocl, 1, 3)
    destring first_ocl, replace force
    drop if missing(first_ocl)

    * second_ocl: characters 4-6 of ocl, converted to a number.
    gen second_ocl = substr(ocl, 4, 3)
    drop if second_ocl == ""
    destring second_ocl, replace force
    drop if missing(second_ocl)

    * subclass: characters 4-9 of ocl, kept as a string.
    gen subclass = substr(ocl, 4, 6)

    * apd is a number of the form YYYYMMDD; split it into month, day and
    * year and convert to a Stata date. Rows without a proper application
    * date are deleted.
    gen adate = mdy(mod(floor(apd/100), 100), mod(apd, 100), floor(apd/10000))
    drop if missing(adate)

    order wku apd isd ocl first_ocl second_ocl subclass adate

    save basic`f', replace
}


* Stack the four cleaned "basic" files, remove exact duplicate rows, and
* save the result sorted by patent number as basic76.dta.
use basic1, clear
append using basic2 basic3 basic4

duplicates drop

order wku apd isd ocl first_ocl second_ocl subclass adate
sort wku

save basic76, replace



*------------------------------------------------------------------------------*

*************
* class.dta : It contains wku, type, class, first_class, second_class, and subclass
*************

* wku : patent number
* type : OCL if primary class and XCL if additional class
* class : US classification (primary or additional)
* nam_assg : name of assignee
* first_class : the first 3 digits of class
* second_class : the next 3 digits of class (after the first 3 digits)
* subclass : the next 6 digits of class (after the first 3 digits)


* Import the four raw "class" extracts (fields separated by "*", variable
* names taken from the first row) and store them as class1.dta to class4.dta.
local k = 0
foreach span in 1976_2001 2002_2004 2005_2014 2015 {
    local ++k
    import delimited "/Users/khs/Documents/patent/raw_data/class_`span'.csv", delimiter("*") varnames(1) clear
    save class`k', replace
}


* Clean each imported "class" file in place: keep only rows with a numeric
* patent number and a parsable classification, and derive the class
* components.
* While cleaning the class information, 110030, 9045, 148138, and 7295
* observations were dropped from class1, class2, class3, and class4,
* respectively. (The resulting "class.dta" file has 20101782 observations.)
forvalues f = 1/4 {
    use class`f', clear

    * Keep only utility patents: truncate wku to its first 8 characters and
    * drop every row where that prefix is not a number.
    replace wku = substr(wku, 1, 8)
    destring wku, replace force
    drop if missing(wku)

    * first_class: first 3 characters of class, converted to a number.
    gen first_class = substr(class, 1, 3)
    destring first_class, replace force
    drop if missing(first_class)

    * second_class: characters 4-6 of class, converted to a number.
    gen second_class = substr(class, 4, 3)
    drop if second_class == ""
    destring second_class, replace force
    drop if missing(second_class)

    * subclass: characters 4-9 of class, kept as a string.
    gen subclass = substr(class, 4, 6)

    save class`f', replace
}


* Stack the four cleaned "class" files, remove exact duplicate rows, and
* save the result sorted by patent number as class76.dta.
use class1, clear
append using class2 class3 class4

duplicates drop

order wku type class
sort wku

save class76, replace



*------------------------------------------------------------------------------*

**************
* invent.dta : It contains wku, nam_invt, cnt, sta, cty, and order
**************

* wku : patent number
* nam_invt : name of inventor
* cnt : country of inventor
* sta : state of inventor
* cty : city of inventor
* order : inventor order that listed in the USPTO bulk data for each patent


* Import the four raw "invent" extracts (fields separated by "*", variable
* names taken from the first row) and store them as invent1.dta to
* invent4.dta.
local k = 0
foreach span in 1976_2001 2002_2004 2005_2014 2015 {
    local ++k
    import delimited "/Users/khs/Documents/patent/raw_data/invent_`span'.csv", delimiter("*") varnames(1) clear
    save invent`k', replace
}


* Clean invent1: repair rows that spill into v6, keep only rows with a
* numeric patent number, and number the inventors of each patent in the
* order in which they appear in the raw file.
use invent1, clear

* Remember the raw row order. The counter is stored as long because the
* default float type represents integers exactly only up to 16,777,216;
* beyond that, distinct rows would share the same value and the sort below
* could no longer reproduce the raw inventor order.
gen long x = _n

* When v6 is filled, the value held in cty is moved to sta and v6 becomes
* the city.
* NOTE(review): presumably these rows were shifted by an extra "*" in the
* raw record -- confirm against the raw file.
replace sta = cty if v6 != ""
replace cty = v6 if v6 != ""
drop v6

* Keep only utility patents: truncate wku to its first 8 characters and
* drop every row where that prefix is not a number.
recast str8 wku, force
destring wku, force replace
drop if wku == .

* order = position of the inventor within the patent, following raw order.
bysort wku (x): gen order = _n
drop x

save invent1, replace


* Clean invent2: fill in the missing US location, keep only rows with a
* numeric patent number, and number the inventors of each patent in the
* order in which they appear in the raw file.
use invent2, clear

* Remember the raw row order. The counter is stored as long because the
* default float type represents integers exactly only up to 16,777,216;
* beyond that, distinct rows would share the same value and the sort below
* could no longer reproduce the raw inventor order.
gen long x = _n

* One observation of a US inventor has no state and city name even though
* the inventor's location is the US; it is set to Ann Arbor, MI.
* NOTE(review): the condition matches every row with cnt == "US", so this
* assumes exactly one such row exists in this file -- confirm.
replace sta = "MI" if cnt == "US"
replace cty = "Ann Arbor" if cnt == "US"
replace cnt = "" if cnt == "US"

* Keep only utility patents: truncate wku to its first 8 characters and
* drop every row where that prefix is not a number.
recast str8 wku, force
destring wku, force replace
drop if wku == .

* order = position of the inventor within the patent, following raw order.
bysort wku (x): gen order = _n
drop x

save invent2, replace


* Clean invent3: fill in the missing US locations, keep only rows with a
* numeric patent number, and number the inventors of each patent in the
* order in which they appear in the raw file.
use invent3, clear

* Remember the raw row order. The counter is stored as long because the
* default float type represents integers exactly only up to 16,777,216;
* beyond that, distinct rows would share the same value and the sort below
* could no longer reproduce the raw inventor order.
gen long x = _n

* Four US inventors have no state and city name even though their location
* is the US (recorded as cnt == "USA"). Their state and city are filled in
* by hand and cnt is blanked.
replace sta = "CA" if nam_invt == "Ardeshir Bayat" & cnt == "USA"
replace cty = "Los Angeles" if nam_invt == "Ardeshir Bayat" & cnt == "USA"
replace cnt = "" if nam_invt == "Ardeshir Bayat" & cnt == "USA"

replace sta = "TX" if nam_invt == "Sung Jun Moon" & cnt == "USA"
replace cty = "Austin" if nam_invt == "Sung Jun Moon" & cnt == "USA"
replace cnt = "" if nam_invt == "Sung Jun Moon" & cnt == "USA"

replace sta = "CA" if nam_invt == "Michael J. Kim" & cnt == "USA"
replace cty = "Reseda" if nam_invt == "Michael J. Kim" & cnt == "USA"
replace cnt = "" if nam_invt == "Michael J. Kim" & cnt == "USA"

replace sta = "PR" if nam_invt == "Josue R. Crespo" & cnt == "USA"
replace cty = "Cabo Rojo" if nam_invt == "Josue R. Crespo" & cnt == "USA"
replace cnt = "" if nam_invt == "Josue R. Crespo" & cnt == "USA"

* Keep only utility patents: truncate wku to its first 8 characters and
* drop every row where that prefix is not a number.
recast str8 wku, force
destring wku, force replace
drop if wku == .

* order = position of the inventor within the patent, following raw order.
bysort wku (x): gen order = _n
drop x

save invent3, replace


* Clean invent4: keep only rows with a numeric patent number, number the
* inventors of each patent in the order in which they appear in the raw
* file, and drop inventors without location information.
use invent4, clear

* Remember the raw row order. The counter is stored as long because the
* default float type represents integers exactly only up to 16,777,216;
* beyond that, distinct rows would share the same value and the sort below
* could no longer reproduce the raw inventor order.
gen long x = _n

* Keep only utility patents: truncate wku to its first 8 characters and
* drop every row where that prefix is not a number.
recast str8 wku, force
destring wku, force replace
drop if wku == .

* order = position of the inventor within the patent, following raw order.
bysort wku (x): gen order = _n
drop x

* Drop observations that do not have the location information. This is done
* after order is assigned, so the remaining inventors keep their original
* position numbers.
drop if cnt == "None"

save invent4, replace


* Stack the four cleaned "invent" files and save the result sorted by
* patent number as invent76.dta.
use invent1, clear
append using invent2 invent3 invent4

order wku cnt sta cty
sort wku

save invent76, replace



*------------------------------------------------------------------------------*

********************
* citing_cited.dta : It contains citing and cited
********************

* citing : citing patent number
* cited : cited patent number


* Import the four raw citation extracts (fields separated by "*", variable
* names taken from the first row) and store them as citing_cited1.dta to
* citing_cited4.dta.
local k = 0
foreach span in 1976_2001 2002_2004 2005_2014 2015 {
    local ++k
    import delimited "/Users/khs/Documents/patent/raw_data/cite_`span'.csv", delimiter("*") varnames(1) clear
    save citing_cited`k', replace
}


* Clean citing_cited1: rebuild cited numbers that continue into v3, then keep
* only pairs in which both the citing and the cited patent number are
* numeric.
use citing_cited1, clear

* Where v3 holds a value, append it (as text) to cited.
* NOTE(review): presumably the cited number was split by an extra "*" in
* the raw record -- confirm against the raw file.
replace cited = cited + string(v3) if v3 != .
drop v3

* Keep only utility patents on the citing side: truncate citing to its
* first 8 characters and drop rows where that prefix is not a number.
replace citing = substr(citing, 1, 8)
destring citing, replace force
drop if missing(citing)

* Keep only utility patents on the cited side: drop rows where cited is not
* a number.
destring cited, replace force
drop if missing(cited)

save citing_cited1, replace


* Clean citing_cited2 to citing_cited4 in place: keep only pairs in which
* both the citing and the cited patent number are numeric.
* The loop macro must be given by name (i). Writing it in macro quotes
* dereferences it before forvalues runs, which leaves forvalues without a
* valid macro name and stops the do-file with a syntax error.
forvalues i = 2/4 {
    use citing_cited`i', clear

    * Keep only utility patents on the citing side: truncate citing to its
    * first 8 characters and drop rows where that prefix is not a number.
    recast str8 citing, force
    destring citing, force replace
    drop if citing == .

    * Keep only utility patents on the cited side: drop rows where cited is
    * not a number.
    destring cited, force replace
    drop if cited == .

    save citing_cited`i', replace
}


* Stack the four cleaned citation files, remove exact duplicate pairs, drop
* low cited numbers, and save the result sorted by citing patent number as
* citing_cited76.dta.
use citing_cited1, clear

* Only citing_cited1 to citing_cited4 are created by this do-file, so the
* loop stops at 4. Running to 5 aborts with a file-not-found error on
* citing_cited5 before citing_cited76 is saved.
forvalues i = 2/4 {
    append using citing_cited`i'
}

duplicates drop

* Drop if the patent number of the cited patent is less than 1000000.
drop if cited < 1000000

sort citing

save citing_cited76, replace
